# Pathways to External Citizenship: 
# The Global Extension of Dual Citizenship and External Voting Rights Across Regime Types
# Sebastián Umpierrez de Reguero 
# Maarten Vink 

# Data description and analysis

# Start with clean workspace
rm(list=ls(all=TRUE))

# ensure that you work in relevant Project in RStudio where the two data files are saved:
# - the main data file used for models m1a:m3b ('pathways_data.csv')
# - and a version with data imputation for models m4a:m4b ('pathways_data_imp')

# required libraries
library(ggseqplot)
library(ggplot2)
library(colorspace)
library(ggh4x)
library(patchwork)
library(gridExtra)
library(gridtext)
library(grid)
library(descr)
#Sequence analysis
library(cluster)
library(pacman)
pacman::p_load(TraMineR, TraMineRextras, cluster, RColorBrewer, devtools, haven, 
               tidyverse, reshape2, WeightedCluster, nnet)
library(broom)
library(expss)
library(factoextra)
library(kableExtra)
library(gtsummary)
library(ggeffects)
library(janitor)
library(knitr)
library(xtable) 
library(gtools)

# Import main data file
data <- read.csv("pathways_data.csv")
summary(data) #10310 obs. of 31 variables
table(data$iso3) #max 61 year observations
table(data$year) #110-195 country observations: 110 countries back to 1960

##################################################
# Descriptive trend plot dj, df, and ldc
##################################################

# Figure 1: descriptive trend plot
dt_tr1 <- data |> 
  group_by(year, evrr_dejure) |>
  summarise (n = n()) |>
  mutate(freq = n / sum(n)) |>
  mutate(extcitvar = "evrr_dejure") |>
  mutate(excitvar_type = "Extraterritorial voting") |>
  filter(evrr_dejure == 1) |>
  select(year, extcitvar, freq, excitvar_type)
dt_tr2 <- data |> 
  group_by(year, evrr_defacto) |>
  summarise (n = n()) |>
  mutate(freq = n / sum(n)) |>
  mutate(extcitvar = "evrr_defacto") |>
  mutate(excitvar_type = "Extraterritorial voting") |>
  filter(evrr_defacto == 1) |>
  select(year, extcitvar, freq, excitvar_type)
dt_tr3 <- data |> 
  filter(!is.na(ldc_bin)) |> # important because there are 327 NA's for ldc_bin
  group_by(year, ldc_bin) |>
  summarise (n = n()) |>
  mutate(freq = n / sum(n)) |>
  mutate(extcitvar = "ldc_bin") |>
  mutate(excitvar_type = "Expatriate dual citizenship") |>
  filter(ldc_bin == 1) |>
  select(year, extcitvar, freq, excitvar_type)
dt_tr0 <- rbind(dt_tr1, dt_tr2)
dt_tr <- rbind(dt_tr0, dt_tr3)

# annote text
dat_text <- data.frame(
  label = c("No restrictions", "De jure", "De facto"),
  excitvar_type   = c("Expatriate dual citizenship", "Extraterritorial voting", "Extraterritorial voting"),
  x     = c(1978, 1980, 2005),
  y     = c(0.35, 0.19, 0.28)
)

jpeg('Fig1.ext.cit.trend.1960-2020.jpeg',  width = 14, height = 8, units = 'in', res = 200)
ggplot(dt_tr, aes(x = year, y = freq)) +
  geom_line(aes(color = extcitvar))+
  facet_grid(~excitvar_type) +
  scale_y_continuous(labels = scales::percent)+
  scale_x_continuous("", limits=c(1960,2021), breaks = seq(1960, 2020, 20))+
  ylab("") +
  xlab("")+
  geom_text(
    data    = dat_text,
    mapping = aes(x = x, y = y, label = label, size = 15))+ # geom_text(aes(x = 1980, y = 0.19, label = "External voting rights (de jure)")) + 
  # geom_text(aes(x = 2005, y = 0.28, label = "External voting rights (de facto)")) + 
  # geom_text(aes(x = 1975, y = 0.45, label = "Expatriate dual citizenship")) + 
  scale_color_manual(values = c("darkgoldenrod1", "darkseagreen4", "darkseagreen4", "darkgoldenrod1"))+ 
  #                               "grey", "grey", "grey"))+
  # geom_text(aes(x = 2021, y = 0.80, label = "80%")) + 
  # geom_text(aes(x = 2021, y = 0.73, label = "73%")) + 
  # geom_text(aes(x = 2021, y = 0.62, label = "62%")) + 
  theme_minimal()+
  theme(text = element_text(size = 20))+
  theme(legend.position="none") 
dev.off()

##################################################
### m1: Main model with evrr_dejure and ldc_bin  

##################################################
# sequence analysis with extcit_dj
##################################################

# Prepare data and change to state sequence format (STS)
# using extcit_djf including any dual citizenship restriction
# include all time invariant covariates
dt_dj <- data |>
  arrange(iso3) |>
  filter(iso3 != 'SSD') |>
  select(iso3, country, year, extcit_dj, 
         mean_v2x_regime, 
         mean_v2x_polyarchy,
         mean_v2x_libdem,
         mean_fh_tot_rev, 
         mean_pop_log,
         mean_gdp_log) |>
  arrange(iso3) |>
  pivot_wider(names_from = year, values_from = extcit_dj)
# 194 obs of 69 vars

# save state sequence file
write.csv(dt_dj, file = "pathways_SA_data_dj.csv", row.names = FALSE)

# Follow example http://mephisto.unige.ch/pub/TraMineR/doc/TraMineR-Users-Guide.pdf
# https://sequenceanalysis.org/2022/08/05/ggseqplot-ggplotify-sequence-plots/
# see also: https://link.springer.com/chapter/10.1007/978-3-031-54464-4_10#Sec15 
state.lab <- c("None", "Only external voting", "Only dual citizenship", "Both")
cpal <- c("brown4", "darkgoldenrod3", "burlywood4", "darkseagreen4")
dt.seq1 <- seqdef(dt_dj,9:69,labels=state.lab, cpal = cpal) #remove 1 row (South Sudan) with NA's in all years!

summary(dt.seq1)
# [>] found missing values ('NA') in sequence data
# [>] preparing 194 sequences
# [>] coding void elements with '%' and missing values with '*'
# [>] 4 distinct states appear in the data: 
#   1 = 0
# 2 = 1
# 3 = 2
# 4 = 3
# [>] state coding:
#   [alphabet]  [label]  [long label] 
# 1  0           0        None
# 2  1           1        Only external voting
# 3  2           2        Only dual citizenship
# 4  3           3        Both
# [>] 194 sequences in the data set
# [>] min/max sequence length: 61/61

# Figure 2: Descriptive sequence plots of states of external citizenship, with expatriate dual citizenship
# acceptance and de jure extraterritorial voting

jpeg('Fig2.m1.extcit.dj.sequence.plots.jpeg',  width = 16, height = 12, units = 'in', res = 100)
par(mfrow = c(2, 2))
seqiplot(dt.seq1, main = "a. Index plot (first 10 sequences)", with.legend = FALSE, cex.main = 2, cex.axis = 1.5)
seqdplot(dt.seq1, main = "c. State distribution plot", with.missing=TRUE, with.legend = FALSE, cex.main = 2, cex.axis = 1.5)
seqfplot(dt.seq1, main = "b. Sequence frequency plot", with.legend = FALSE, pbarw = TRUE, cex.main = 2, cex.axis = 1.5)
seqlegend(dt.seq1, cex = 2)
dev.off()


# Optimal matching (OM) distances
# substitution-cost matrix
# default from https://link.springer.com/chapter/10.1007/978-3-031-54464-4_10#Sec15
# the most straightforward way of computing the cost is to use a constant cost; 
# that is, to assume that the states are equally distant from one another. 
# To do so, we can use the function seqsubm() and supply the argument method=”CONSTANT”. 
# In the following example, we assign a common cost of 2 (via the argument cval). 
# We also refer below to other optional arguments which are not strictly necessary 
# for the present application but nonetheless worth highlighting as options.
substitution_cost_constant <- seqsubm (
  dt.seq1,
  # Sequence object
  method = "CONSTANT",
  # Method to determine costs
  cval = 2,
  # Substitution cost
  time.varying = FALSE, # Does not allow the cost to vary over time
  #time.varying = TRUE, # Allows the cost to vary over time
  with.missing = TRUE,
  # Allows for missingness state
  miss.cost = 1,
  # Cost for substituting a missing state
  weighted = TRUE)
# Allows weights to be used when applicable
# [>] creating 5x5 substitution-cost matrix using 2 as constant value

# To compute the OM dissimilarity matrix, 
# the indel argument needs to be provided 

# We set here at 1.5 based on:
# https://cran.r-project.org/web/packages/seqhandbook/vignettes/Tutorial.html
# For Optimal Matching, the construction of a distance matrix between sequences is done with the seqdist function. 
# This also involves defining a substitution cost matrix between states (with the seqsubm function). 
# Here the substitution costs are constant and equal to 2 and the indel cost is equal to 1.5.

#use the substitution cost matrix previously computed for the dt.seq1 sequence object using the seqsubm() command
dt.om <- seqdist(dt.seq1, method = "OM", indel = 1.5, sm = substitution_cost_constant, with.missing = T)
dt.om
# [>] including missing values as an additional state
# [>] 194 sequences with 5 distinct states
# [>] checking 'sm' (size and triangle inequality)
# [>] 176 distinct sequences 
# [>] min/max sequence lengths: 61/61
# [>] computing distances using the OM metric
# [>] elapsed time: 0.131 secs

# Optimal number of clusters?
# https://sa-book.github.io/rChapter4-2.html
extcit.ward1 <- hclust(as.dist(dt.om), 
                       method = "ward.D")

# Fig SM1. Scree plot
jpeg('FigSM1.excit.dj.screeplot.jpeg',  width = 8, height = 6, units = 'in', res = 200)
fviz_nbclust(dt.om, 
             FUN = hcut, 
             method = "wss",
             barfill = "black",
             barcolor = "black",
             linecolor = "black")
dev.off()
# Interpretation: there is no clearly visible 'elbow' 

# Check cluster quality indicators
# follow https://sa-book.github.io/rChapter4-2.html
extcit.ward.test <- as.clustrange(extcit.ward1, 
                               diss = dt.om,
                               ncluster = 10)
extcit.ward.test
# ASW for all clusters above minimum score of 0.25 
# for sufficient between-group distances and within-group homogeneity

# In order to have sufficiently homogeneous and meaningful clusters 
# but to avoid a variable with too many categories 
# in our small sample multinomial regression analysis (N = 194 countries), 
# we opt for a 5-cluster solution.

# [additional code to explore various cluster solutions removed]
# [to explore alternative solutions, replace cutree(extcit_clusterward, k = 5) in code below]

# 5-cluster solution
# The cluster membership for each sequence is then retrieved. A five clusters solution is chosen here.
extcit_clusterward <- agnes(dt.om, diss = TRUE, method = "ward")

extcit_cluster5 <- cutree(extcit_clusterward, k = 5)
extcit_cluster5 <- factor(extcit_cluster5, 
                          labels = c("Norm setters", "Dual citizenship only", "Norm resisters","External voting only", "Latecomers"))
table(extcit_cluster5) 
# Norm setters    Dual citizenship only        Norm resisters           External voting only            Latecomers 
# 62                        28                    47                    30                    27 

#reorder levels to ensure that this is the same for all models
extcit_cluster5 <- factor(extcit_cluster5, 
                          levels = c("Norm setters", "Dual citizenship only", "External voting only", "Latecomers", "Norm resisters"))

# add percentages to labels based on frequency (out of 194 countries)
extcit_cluster5_perc <- cutree(extcit_clusterward, k = 5)
extcit_cluster5_perc <- factor(extcit_cluster5, labels = c("Norm setters (32.0%)", "Dual citizenship only (14.4%)", "External voting only (15.5%)", "Latecomers (13.9%)", "Norm resisters (24.2%)"))
extcit_cluster5_perc <- factor(extcit_cluster5_perc, levels = c("Norm setters (32.0%)", "Dual citizenship only (14.4%)", "External voting only (15.5%)", "Latecomers (13.9%)", "Norm resisters (24.2%)"))
table(extcit_cluster5_perc)
# extcit_cluster5_perc
# Norm setters (32.0%) 62
# Dual citizenship only (14.4%) 28
# External voting only (15.5%) 30        
# Latecomers (13.9%) 27
# Norm resisters (24.2%) 47 

seqfplot(dt.seq1, group = extcit_cluster5, pbarw = T)
seqmtplot(dt.seq1, group = extcit_cluster5)
seqrplot(dt.seq1, group = extcit_cluster5, diss = dt.om, coverage = 0.35, border = NA)

# Figure 3: External citizenship trajectories: five-cluster solution based on optimal matching algorithm
# with constant substitution costs
# state distribution plots of the typology
jpeg('Fig3.m1.extcit.dj.SA.5.stat.distr.jpeg',  width = 14, height = 14, units = 'in', res = 100)
seqdplot(dt.seq1, with.missing = T, with.legend = T, group = extcit_cluster5, legend.prop = 0.25, cex.main = 2, cex.legend = 2, cex.axis = 1.5)
dev.off()

# Figure 4: External citizenship trajectories: five-cluster solution, mean time in state per cluster
# (missings excluded only for visual purpose)
jpeg('Fig4.m1.extcit.dj.SA.time.5.plots.jpeg',  width = 12, height = 8, units = 'in', res = 100)
seqmtplot(dt.seq1, group = extcit_cluster5, cex.main = 2, cex.legend = 2, cex.axis = 1.5)
dev.off()

# Alternatively can also visualise as
# Modal state sequence plots
# The modal state sequence plots represent, for each cluster, the sequence of modal states for each position in time. 
# At each position in time, the height of the bar is proportional to the frequency of the modal state.
# seqmsplot(dt.seq1, group=extcit_cluster5, cex.legend=2, cex.main = 2)
# and with missings included
# seqmsplot(dt.seq1, group=extcit_cluster5, with.missing = T, with.legend = T, cex.main = 2, cex.legend = 2, info=FALSE)

# Attach the vector with the 5-cluster solution to the main data.frame
# This is necessary for the next step: exploring correlates of clustering
SA.data <- dt_dj
SA.data$extcit_cluster5 <-extcit_cluster5
summary(SA.data)
# save data
write_csv(SA.data, file = "pathways_SA_data_dj.csv")

# prepare data for Table SM6 comparing clustering of countries across models with
# different measures of external voting and without/with data imputation
SM.rob.clustering <- dt_dj |>
  select(c(iso3, country))
SM.rob.clustering$m1 <- extcit_cluster5


###Explore covariates of sequences with multinomial regression

# Table SM2. Summary statistics table 
# Calculate summary statistics with xtable command st() to easily combine numeric and factor variables 
SA.data |>
  #mutate(income_o=droplevels(income_o)) |>
  select(extcit_cluster5,
         mean_fh_tot_rev, mean_v2x_polyarchy, 
         mean_pop_log, mean_gdp_log) |>
  st(out = "latex", file = "TableS2_sum_stat.tex",
     labels = c("Clustering models m1a and m1b",
                "Mean FH political rights and civil liberties", "Mean V-DEM electoral democracy",
                "Mean GDP per capita (logged)","Mean total population (logged)"),
     title = "Summary statistics of variables in analyses for models m1a and m1b")

# label variables
SA.data = apply_labels(SA.data,
                       mean_fh_tot_rev = "FH political rights and civil liberties", #1 NA
                       mean_v2x_polyarchy = "V-DEM electoral democracy", #21 NA's
                       mean_pop_log = "Population (logged)",
                       mean_gdp_log = "GDP per capita (logged)")

#basic multinomial model w continuous variables (FH)
m1a <- multinom(extcit_cluster5 ~ mean_fh_tot_rev + mean_pop_log + mean_gdp_log, data=SA.data)
summary(m1a)
tidy(m1a, conf.int = TRUE) 
library(modelsummary)
modelsummary(m1a)

#exponentiate
tidy(m1a, conf.int = TRUE, exponentiate = TRUE) %>%
  kable() %>%
  kable_styling("basic", full_width = FALSE)

# tabulated with gtsummary, exponentiated  
# install.packages("gtsummary")
tbl_regression(m1a, exp = TRUE)

#as .tex file (to include in markdown)
tbl_regression(m1a, exp = TRUE) %>%
  as_gt() %>%
  gt::gtsave(filename = "m1a.extcit.dj_table.pred.prob.cl5.exp.tex")  

# plot predicted probabilities
# https://strengejacke.github.io/ggeffects/articles/introduction_plotcustomize.html
# ggeffect(m1) to explore all effects
# Prep facet labels
response1a.labs <- c("Norm setters", "Dual citizenship only", 
                   "External voting only", "Latecomers", "Norm resisters (* p<0.05)")
names(response1a.labs) <- c("Norm.setters", "Dual.citizenship.only",
                          "External.voting.only", "Latecomers", "Norm.resisters")
# lay-out vertical version for combi plot Fig 5
p1a_vert <- ggeffect(m1a, terms = "mean_fh_tot_rev") %>%
  plot() + 
  # Change the facet labels
  facet_wrap(~ response.level, nrow = 5,
             labeller = labeller(response.level = response1a.labs))+
  scale_x_continuous(breaks = c(0, 4, 8, 12)) + 
  scale_y_continuous(limits = c(0,0.8), breaks = c(0, 0.2, 0.4, 0.6, 0.8)) + 
  labs(
    x = "\nFH political rights and civil liberties",
    y = "predicted probabilities\n",
    title = "Freedom House") +
  theme_ggeffects(base_size = 14)
p1a_vert  

# Robustness check with v-dem Electoral Dem
m1b <- multinom(extcit_cluster5 ~ mean_v2x_polyarchy + mean_pop_log + mean_gdp_log, data=SA.data)
summary(m1b)
tidy(m1b, conf.int = TRUE) 
stargazer(m1b)

#exponentiate
tidy(m1b, conf.int = TRUE, exponentiate = TRUE) %>%
  kable() %>%
  kable_styling("basic", full_width = FALSE)

#as .tex file (to include in markdown)
tbl_regression(m1b, exp = TRUE) %>%
  as_gt() %>%
  gt::gtsave(filename = "m1b.extcit.dj_table.pred.prob.vd.ed.tex")  

# plot predicted probabilities
# https://strengejacke.github.io/ggeffects/articles/introduction_plotcustomize.html
# ggeffect(m1) to explore all effects
# Prep facet labels
response1b.labs <- c("Norm setters", "Dual citizenship only", 
                   "External voting only", "Latecomers", "Norm resisters (** p<0.01)")
names(response1b.labs) <- c("Norm.setters", "Dual.citizenship.only",
                          "External.voting.only", "Latecomers", "Norm.resisters")
# vertical version
p1b_vert <- ggeffect(m1b, terms = "mean_v2x_polyarchy") %>%
  plot() + 
  # Change the facet labels
  facet_wrap(~ response.level, nrow=5,
             labeller = labeller(response.level = response1b.labs))+
  scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1)) + 
  scale_y_continuous(limits = c(0,0.8), breaks = c(0, 0.2, 0.4, 0.6, 0.8)) + 
  labs(
    x = "\nVD electoral democracy",
    y = "\n",
    title = "V-DEM") +
  theme_ggeffects(base_size = 14) 
p1b_vert  

jpeg('Fig5_m1a_m1b.extcit.dj.pred.prob.cl5.jpeg',  width = 12, height = 10, units = 'in', res = 100)
grid.arrange(p1a_vert, p1b_vert, ncol=2, widths = c(1,1))
dev.off()


##################################################
# m2: sequence analysis with data_df2 
##################################################

# prepare djf2 data and change to state sequence format (STS)
# using extcit2_djf including consistent dual citizenship restriction
# include all time invariant covariates
dt_df <- data |>
  arrange(iso3) |>
  filter(iso3 != 'SSD') |>
  select(iso3, country, year, extcit_df, 
         mean_v2x_polyarchy, mean_fh_tot_rev,
         mean_pop_log, mean_gdp_log) |>
  arrange(iso3) |>
  pivot_wider(names_from = year, values_from = extcit_df)
# 194 obs of 67 vars

# save state sequence file
write.csv(dt_df, file = "pathways_SA_data_df.csv", row.names = FALSE)

#import SA data
#dt_df2 <- read.csv("pathways_SA_data_djf2_2024-10-05.csv", check.names = FALSE)

# follow example http://mephisto.unige.ch/pub/TraMineR/doc/TraMineR-Users-Guide.pdf
# https://sequenceanalysis.org/2022/08/05/ggseqplot-ggplotify-sequence-plots/
# see also: https://link.springer.com/chapter/10.1007/978-3-031-54464-4_10#Sec15 
state.lab <- c("None", "Only external voting (de facto)", "Only dual citizenship", "Both")
cpal <- c("brown4", "darkgoldenrod3", "burlywood4", "darkseagreen4")
dt.seq2 <- seqdef(dt_df,7:67,labels=state.lab, cpal = cpal) #remove 1 row (South Sudan) with NA's in all years!

summary(dt.seq2)
# [>] sequence object created with TraMineR version 2.2-10 
# [>] 194 sequences in the data set, 171 unique 
# [>] min/max sequence length: 61/61
# [>] alphabet (state labels):  
#   1=0 (None)
# 2=1 (Only external voting (de facto))
# 3=2 (Only dual citizenship)
# 4=3 (Both)
# [>] dimensionality of the sequence space: 183 
# [>] colors: 1=brown4 2=darkgoldenrod3 3=burlywood4 4=darkseagreen4 
# [>] symbol for missing state: * 
  
# Optimal matching (OM) distances
# substitution-cost matrix
# default from https://link.springer.com/chapter/10.1007/978-3-031-54464-4_10#Sec15
# the most straightforward way of computing the cost is to use a constant cost; 
# that is, to assume that the states are equally distant from one another. 
# To do so, we can use the function seqsubm() and supply the argument method=”CONSTANT”. 
# In the following example, we assign a common cost of 2 (via the argument cval). 
# We also refer below to other optional arguments which are not strictly necessary 
# for the present application but nonetheless worth highlighting as options.
substitution_cost_constant <- seqsubm (
  dt.seq2,
  # Sequence object
  method = "CONSTANT",
  # Method to determine costs
  cval = 2,
  # Substitution cost
  time.varying = FALSE, # Does not allow the cost to vary over time
  #time.varying = TRUE, # Allows the cost to vary over time
  with.missing = TRUE,
  # Allows for missingness state
  miss.cost = 1,
  # Cost for substituting a missing state
  weighted = TRUE)
# Allows weights to be used when applicable
# [>] creating 5x5 substitution-cost matrix using 2 as constant value

# To compute the OM dissimilarity matrix, 
# the indel argument needs to be provided 

# We set here at 1.5 based on:
# https://cran.r-project.org/web/packages/seqhandbook/vignettes/Tutorial.html
# For Optimal Matching, the construction of a distance matrix between sequences is done with the seqdist function. 
# This also involves defining a substitutio n cost matrix between states (with the seqsubm function). 
# Here the substitution costs are constant and equal to 2 and the indel cost is equal to 1.5.

#use the substitution cost matrix previously computed for the dt.seq2 sequence object using the seqsubm() command
dt.om <- seqdist(dt.seq2, method = "OM", indel = 1.5, sm = substitution_cost_constant, with.missing = T)
dt.om
# [>] including missing values as an additional state
# [>] 194 sequences with 5 distinct states
# [>] checking 'sm' (size and triangle inequality)
# [>] 179 distinct sequences 
# [>] min/max sequence lengths: 61/61
# [>] computing distances using the OM metric
# [>] elapsed time: 0.122 secs

# check clusters
clusterward <- agnes(dt.om, diss = TRUE, method = "ward")

# 5 clusters
#The cluster membership for each sequence is then retrieved. A five clusters solution is chosen here.
extcit_df_cluster5 <- cutree(clusterward, k = 5)
extcit_df_cluster5 <- factor(extcit_df_cluster5, labels = c("Dual citizenship only", "Norm resisters", "Norm setters", "External voting only", "Latecomers"))
table(extcit_df_cluster5)
#reorder levels to ensure that this is the same for all models
extcit_df_cluster5 <- factor(extcit_df_cluster5, levels = c("Norm setters", "Dual citizenship only", "External voting only", "Latecomers", "Norm resisters"))

table(extcit_df_cluster5)
# Norm setters Dual citizenship only  External voting only            Latecomers        Norm resisters 
# 27                    54                    38                    37                    38 

#prepare data for SM table cluster overview
SM.rob.clustering$m2 <- extcit_df_cluster5

###Explore covariates of sequences with multinomial regression

# Attach the vector with the 5-cluster solution to the main data.frame
SA.data <- dt_df
SA.data$extcit_df_cluster5 <-extcit_df_cluster5
summary(SA.data)

# label variables
SA.data = apply_labels(SA.data,
                       mean_fh_tot_rev = "FH political rights and civil liberties",
                       mean_pop_log = "Population (logged)",
                       mean_gdp_log = "GDP per capita (logged)")

#basic multinomial model w FH
m2a <- multinom(extcit_df_cluster5 ~ mean_fh_tot_rev + mean_pop_log + mean_gdp_log, data=SA.data)

# with V-DEM
m2b <- multinom(extcit_df_cluster5 ~ mean_v2x_polyarchy + mean_pop_log + mean_gdp_log, data=SA.data)

#########################
# m3: Alternative sequence analysis operationalisation
# prioritise duration (distribution aver the entire period) with different parameters
# https://cran.r-project.org/web/packages/seqhandbook/vignettes/Tutorial.html
dissim.dur <- seqdist(dt.seq1, method="EUCLID", step=37, sm=substitution_cost_constant,
                      indel=1, with.missing = T)

# # check 5 clusters
extcit_clusterward.dur <- agnes(dissim.dur, diss = TRUE, method = "ward")

extcit_cluster5dur <- cutree(extcit_clusterward.dur, k = 5)
extcit_cluster5dur <- factor(extcit_cluster5dur, labels = c("Norm setters", "Dual citizenship only", "Norm resisters", "Latecomers", "External voting only"))
table(extcit_cluster5dur)
#reorder levels to ensure that this is the same for all models
extcit_cluster5dur <- factor(extcit_cluster5dur, levels = c("Norm setters", "Dual citizenship only", "External voting only", "Latecomers", "Norm resisters"))
table(extcit_cluster5dur)

#Explore covariates of sequences with multinomial regression with duration solution
SA.data$extcit_cluster5dur <-extcit_cluster5dur
write_csv(SA.data, file = "pathways_SA_data_djf.csv")

#prepare data for SM table cluster overview
SM.rob.clustering$m3 <- extcit_cluster5dur

# label variables
SA.data = apply_labels(SA.data,
                       mean_fh_tot_rev = "FH political rights and civil liberties",
                       mean_pop_log = "Population (logged)",
                       mean_gdp_log = "GDP (logged)")

#basic multinomial model w FH
m3a <- multinom(extcit_cluster5dur ~ mean_fh_tot_rev + mean_pop_log + mean_gdp_log, data=SA.data)

#basic multinomial model w FH
m3b <- multinom(extcit_cluster5dur ~ mean_v2x_polyarchy + mean_pop_log + mean_gdp_log, data=SA.data)


###########################

# m4: robustness check 3 with imputed data
# check whether makes a difference to impute values for NA's in early years for quite a few countries

# Import file with imputed data
data <- read.csv(file = "pathways_data_imp.csv") # use imputed data

##################################################
# m4. sequence analysis with data_djf_imp 
##################################################

#prepare DJF data and change to state sequence format (STS)
# using extcit2_djf including consistent dual citizenship restriction
# include all time invariant covariates
dt_dj_imp <- data |>
  arrange(iso3) |>
  filter(iso3 != 'SSD') |>
  select(iso3, country, year, extcit_dj, 
         mean_v2x_polyarchy, mean_fh_tot_rev, 
         mean_pop_log, mean_gdp_log) |>
  arrange(iso3) |>
  pivot_wider(names_from = year, values_from = extcit_dj)
# 194 obs of 67 vars

# save state sequence file
write.csv(dt_dj_imp, file = "pathways_SA_data_dj_imp.csv", row.names = FALSE)

# follow example http://mephisto.unige.ch/pub/TraMineR/doc/TraMineR-Users-Guide.pdf
# https://sequenceanalysis.org/2022/08/05/ggseqplot-ggplotify-sequence-plots/
# see also: https://link.springer.com/chapter/10.1007/978-3-031-54464-4_10#Sec15 
state.lab <- c("None", "Only external voting", "Only dual citizenship", "Both")
cpal <- c("brown4", "darkgoldenrod3", "burlywood4", "darkseagreen4")
dt.seq4 <- seqdef(dt_dj_imp,7:67,labels=state.lab, cpal = cpal) #remove 1 row (South Sudan) with NA's in all years!

summary(dt.seq4)
# [>] sequence object created with TraMineR version 2.2-10 
# [>] 194 sequences in the data set, 168 unique 
# [>] min/max sequence length: 61/61
# [>] alphabet (state labels):  
#   1=0 (None)
# 2=1 (Only external voting (de jure))
# 3=2 (Only dual citizenship)
# 4=3 (Both)
# [>] dimensionality of the sequence space: 183 
# [>] colors: 1=brown4 2=darkgoldenrod3 3=burlywood4 4=darkseagreen4 
# [>] symbol for missing state: * 


#Optimal matching (OM) distancess
#substitution-cost matrix
# default from https://link.springer.com/chapter/10.1007/978-3-031-54464-4_10#Sec15
# the most straightforward way of computing the cost is to use a constant cost; 
# that is, to assume that the states are equally distant from one another. 
# To do so, we can use the function seqsubm() and supply the argument method=”CONSTANT”. 
# In the following example, we assign a common cost of 2 (via the argument cval). 
# We also refer below to other optional arguments which are not strictly necessary 
# for the present application but nonetheless worth highlighting as options.
substitution_cost_constant <- seqsubm (
  dt.seq4,
  # Sequence object
  method = "CONSTANT",
  # Method to determine costs
  cval = 2,
  # Substitution cost
  time.varying = FALSE, # Does not allow the cost to vary over time
  #time.varying = TRUE, # Allows the cost to vary over time
  with.missing = TRUE,
  # Allows for missingness state
  miss.cost = 1,
  # Cost for substituting a missing state
  weighted = TRUE)
# Allows weights to be used when applicable
# [>] creating 5x5 substitution-cost matrix using 2 as constant value

# To compute the OM dissimilarity matrix, 
# the indel argument needs to be provided 

# We set here at 1.5 based on:
# https://cran.r-project.org/web/packages/seqhandbook/vignettes/Tutorial.html
# For Optimal Matching, the construction of a distance matrix between sequences is done with the seqdist function. 
# This also involves defining a substitutio n cost matrix between states (with the seqsubm function). 
# Here the substitution costs are constant and equal to 2 and the indel cost is equal to 1.5.

#use the substitution cost matrix previously computed for the dt.seq4 sequence object using the seqsubm() command
dt.om <- seqdist(dt.seq4, method = "OM", indel = 1.5, sm = substitution_cost_constant, with.missing = T)
dt.om
# [>] including missing values as an additional state
# [>] 194 sequences with 5 distinct states
# [>] checking 'sm' (size and triangle inequality)
# [>] 168 distinct sequences 
# [>] min/max sequence lengths: 61/61
# [>] computing distances using the OM metric
# [>] elapsed time: 0.128 secs

# check clusters
excit_imp_clusterward <- agnes(dt.om, diss = TRUE, method = "ward")

# 5 clusters
# The cluster membership for each sequence is then retrieved. A five clusters solution is chosen here.
excit_imp_cluster5 <- cutree(excit_imp_clusterward, k = 5)
excit_imp_cluster5 <- factor(excit_imp_cluster5, labels = c("Norm setters", "Dual citizenship only", "Norm resisters","External voting only", "Latecomers"))
table(excit_imp_cluster5)
#reorder levels to ensure that this is the same for all models
excit_imp_cluster5 <- factor(excit_imp_cluster5, levels = c("Norm setters", "Dual citizenship only", "External voting only", "Latecomers", "Norm resisters"))
table(excit_imp_cluster5)

# Attach the vector with the 5-cluster solution to the main data.frame
SA.data.imp <- dt_dj_imp
SA.data.imp$excit_imp_cluster5 <-excit_imp_cluster5
summary(SA.data.imp)
write_csv(SA.data.imp, "pathways_SA_data_dj_imp.csv")

#prepare data for SM table cluster overview
SM.rob.clustering$m4 <- excit_imp_cluster5

# calculate % of similarity in clustering type between models
SM.rob.clustering <- SM.rob.clustering |>
  mutate(perc.similar.m1 = ifelse(m1 == m2 & m1 == m3 & m1 == m4, 100, # 70 countries clustering similar to m1 in all 3 alternative models
                               ifelse((m1 == m2 & (m1 == m3 | m1 == m4)) | (m1 == m3 & m1 == m4), 75, #122 countries clustering similar to m1 in 2 alternative models
                                      ifelse(m1 == m2 | m1 == m3 | m1 == m4, 50, #169 countries similar in 1 alternative model
                                               0)))) |> #
  mutate(perc.similar.m2 = ifelse(m2 == m1 & m2 == m3 & m2 == m4, 100, # 70 countries 
                                ifelse((m2 == m1 & (m2 == m3 | m2 == m4)) | (m2 == m3 & m2 == m4), 75, # 120 countries   
                                      ifelse(m2 == m1 | m2 == m3 | m2 == m4, 50, #153 countries
                               0)))) |> 
  mutate(perc.similar.m3 = ifelse(m3 == m1 & m3 == m2 & m3 == m4, 100, # 70 countries 
                                  ifelse((m3 == m1 & (m3 == m2 | m3 == m4)) | (m3 == m2 & m3 == m4), 75, # 130 countries
                                         ifelse(m3 == m1 | m3 == m2 | m3 == m4, 50, #170 countries
                                                0)))) |> 
  mutate(perc.similar.m4 = ifelse(m4 == m1 & m4 == m2 & m4 == m3, 100, # 70 countries 
                                  ifelse((m4 == m1 & (m4 == m2 | m4 == m3)) | (m4== m2 & m4 == m3), 75, #124 countries
                                         ifelse(m4 == m1 | m4 == m2 | m4 == m3, 50, #162 countries
                                                0)))) 
  
SM.rob.clustering <- SM.rob.clustering[order(-SM.rob.clustering$perc.similar.m1, SM.rob.clustering$country), ]

write_csv(SM.rob.clustering, "SM_rob.clustering.csv") 

# Table SM6 : export in .tex
# abbreviate long country names for table
SM.rob.clust.short <- SM.rob.clustering |>
  select(-iso3, -perc.similar.m2, -perc.similar.m3, -perc.similar.m4) |>
  mutate_all(funs(str_replace(., "Dual citizenship only", "DC only"))) |>
  mutate_all(funs(str_replace(., "External voting only", "EV only"))) |>
  mutate_all(funs(str_replace(., "Norm resisters", "Norm resist"))) |>
  mutate_all(funs(str_replace(., "Norm embracers", "Norm embrace"))) |>
  mutate_all(funs(str_replace(., "Latecomers", "Late"))) |>
  mutate_all(funs(str_replace(., "Antigua and Barbuda", "Antigua Barb"))) |>
  mutate_all(funs(str_replace(., "Bosnia and Herzegovina", "Bosnia Herz"))) |>
  mutate_all(funs(str_replace(., "Democratic Republic of the Congo Republic of the", "DR Congo"))) |>
  mutate_all(funs(str_replace(., "Saint Kitts and Nevis", "Saint Kitts N"))) |>
  mutate_all(funs(str_replace(., "Saint Vincent and the Grenadines", "Saint Vincent G"))) |>
  mutate_all(funs(str_replace(., "Sao Tome and Principe", "Sao Tome Pr"))) |>
  mutate_all(funs(str_replace(., "Trinidad and Tobago", "Trinidad Tob"))) |>
  mutate_all(funs(str_replace(., "United Arab Emirates", "UAE"))) |>
  mutate_all(funs(str_replace(., "Congo Republic of the", "Congo Rep"))) |>
  mutate_all(funs(str_replace(., "Papua New Guinea", "Papua NG"))) |>
  mutate_all(funs(str_replace(., "United States of America", "United States"))) |>
  mutate_all(funs(str_replace(., "Central African Republic", "Centr Afr Rep")))

# save table
print(xtable(SM.rob.clust.short, type = "latex"), include.rownames=FALSE, file = "TableSM6.rob.clustering.table.country.clusters5.tex")


### Explore covariates of sequences with multinomial regression

# label variables
SA.data.imp = apply_labels(SA.data.imp,
                       mean_fh_tot_rev = "FH political rights and civil liberties",
                       mean_pop_log = "Population (logged)",
                       mean_gdp_log = "GDP per capita (logged)")

#basic model w continuous variables
m4a <- multinom(excit_imp_cluster5 ~ mean_fh_tot_rev + mean_pop_log + mean_gdp_log, data=SA.data.imp)

#basic model w continuous variables
m4b <- multinom(excit_imp_cluster5 ~ mean_v2x_polyarchy + mean_pop_log + mean_gdp_log, data=SA.data.imp)

# Figure SM2: dotplot summary alternative model effects on clustering - democracy association

# Plot regression output summary for m1a-m4a
# Put all results m1-m4 for mean_fh_tot_rev in a tidy frame 
m1fh <- tidy(m1a, conf.int = TRUE) |>
  filter(term == 'mean_fh_tot_rev') |> 
  mutate(signif = stars.pval(p.value)) |>
  mutate(model = "m1a")
# see: https://stackoverflow.com/questions/48877475/how-can-i-add-stars-to-broom-packages-tidy-function-output
m2fh <- tidy(m2a, conf.int = TRUE) |>
  filter(term == 'mean_fh_tot_rev') |> 
  mutate(signif = stars.pval(p.value)) |>
  mutate(model = "m2a")
m3fh <- tidy(m3a, conf.int = TRUE) |>
  filter(term == 'mean_fh_tot_rev') |> 
  mutate(signif = stars.pval(p.value)) |>
  mutate(model = "m3a")
m4fh <- tidy(m4a, conf.int = TRUE) |>
  filter(term == 'mean_fh_tot_rev') |> 
  mutate(signif = stars.pval(p.value)) |>
  mutate(model = "m4a")
m1_2fh <- rbind(m1fh, m2fh)
m1_3fh <- rbind(m1_2fh, m3fh)
m1_4fh <- rbind(m1_3fh, m4fh)
m1_4fh

# plot coefficients FH for 4 multinominal models
jpeg('FigSM2a_m1a_m4a_coefficient_plot_FH.jpeg',  width = 13, height = 10, units = 'in', res = 100)
ggplot(m1_4fh, aes(x=model, y=exp(estimate), ymin=exp(conf.low), ymax=exp(conf.high), colour = model)) + 
  facet_wrap(~y.level) +
  geom_pointrange() + 
  geom_hline(yintercept=1, linetype=2)+
  scale_y_log10(limits = c(0.6,1.7), breaks = c(0.6,1,1.4))+
  coord_flip() + 
  theme_bw() + 
  theme(text = element_text(size = 20))+
  theme(legend.position="none") +
  ylab("\nFH political rights and civil liberties (exponentiated coefficient estimates and standard errors)") +
  xlab("")
dev.off()


# Plot regression output summary for m1b-m4b
# Put all results m1-m4 for mean_v2x_polyarchy in a tidy frame 
m1vd <- tidy(m1b, conf.int = TRUE) |>
  filter(term == 'mean_v2x_polyarchy') |> 
  mutate(signif = stars.pval(p.value)) |>
  mutate(model = "m1b")
# see: https://stackoverflow.com/questions/48877475/how-can-i-add-stars-to-broom-packages-tidy-function-output
m2vd <- tidy(m2b, conf.int = TRUE) |>
  filter(term == 'mean_v2x_polyarchy') |> 
  mutate(signif = stars.pval(p.value)) |>
  mutate(model = "m2b")
m3vd <- tidy(m3b, conf.int = TRUE) |>
  filter(term == 'mean_v2x_polyarchy') |> 
  mutate(signif = stars.pval(p.value)) |>
  mutate(model = "m3b")
m4vd <- tidy(m4b, conf.int = TRUE) |>
  filter(term == 'mean_v2x_polyarchy') |> 
  mutate(signif = stars.pval(p.value)) |>
  mutate(model = "m4b")
m1_2vd <- rbind(m1vd, m2vd)
m1_3vd <- rbind(m1_2vd, m3vd)
m1_4vd <- rbind(m1_3vd, m4vd)
m1_4vd

# plot coefficients V-DEM for 4 multinominal models
jpeg('FigSM2b_m1b_m4b_coefficient_plot_VD.jpeg',  width = 13, height = 10, units = 'in', res = 100)
ggplot(m1_4vd, aes(x=model, y=exp(estimate), ymin=exp(conf.low), ymax=exp(conf.high), colour = model)) + 
  facet_wrap(~y.level) +
  geom_pointrange() + 
  geom_hline(yintercept=1, linetype=2)+
  scale_y_log10()+
  coord_flip() + 
  theme_bw() + 
  theme(text = element_text(size = 20))+
  theme(legend.position="none") +
  ylab("\nV-DEM electoral democracy (exponentiated coefficient estimates and standard errors)") +
  xlab("")
dev.off()


### END ###

